Description

This R notebook is a bioinformatics pipeline to analyze fitness data obtained from a barcoded transposon library in Ralstonia eutropha a.k.a. Cupriavidus necator. For background and details regarding the method, see Wetmore at al., mBio, 2015 and Price et al., Nature, 2018).

Libraries

# optionally install repos from github
# devtools::install_github("m-jahn/lattice-tools")
# devtools::install_github("m-jahn/R-tools")
library(lattice)
library(latticeExtra)
library(latticetools)
library(tidyverse)
library(dendextend)
library(Rtools)
library(colorspace)

Overview of barcode/transposon read counts

Data import and processing

Read in the main data tables with A) reads per barcode and sample (‘pool counts’), and B) the fitness tables. Tables were obtained by processing sequencing data with a custom BarSeq pipeline. The 32 generation sequencing samples are removed due to the low read count in the continuous samples.

Summary statistics

Overview about the number of reads per barcode, barcodes per gene and so on. Around 8-10 M reads were mapped on average, per sample.

# Number of total mapped reads
df_counts %>% group_by(sample) %>%
  summarize(n_million_reads = sum(n_reads)/10^6) %>%
  barchart(factor(sample) ~ n_million_reads, .,
    par.settings = custom.colorblind(),
    horizontal = TRUE, border = NULL,
    scales = list(y = list(cex = 0.7)),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.barchart(x, y, ...)
    }
  )

Distribution of number of reads per barcode. There is a sufficient number of reads for quantification, on average log2(n) = 5 = 32 reads per barcode.

plot_reads_per_bc <- histogram(~ log2(n_reads) | sample,
  df_counts, as.table = TRUE, layout = c(8,6),
  par.settings = custom.colorblind(),
  between = list(x = 0.5, y = 0.5),
  xlab = expression("log"[2]*" reads per barcode"),
  scales = list(alternating = FALSE),
  panel = function(x, ...) {
    panel.grid(h = -1, v = -1, col = grey(0.9))
    panel.histogram(x, border = "white", ...)
  }
)

print(plot_reads_per_bc)

Similarly to the above, this is an overview about the number of barcodes per gene as a histogram. This distribution is the same for all conditions and replicates. The second plot is the number of reads per gene, averaged as median over all conditions (excluding 0 time point where counts were averaged by BarSeq pipeline). The average reads per gene are log2(n) = 8, which translates to n = 2^8 or around 256 reads per gene.

plot_reads_per_bc <- histogram(~ Strains_per_gene,
  df_fitness %>% select(locus_tag, Strains_per_gene) %>% 
    distinct %>% filter(Strains_per_gene < 40), 
  par.settings = custom.colorblind(), breaks = 20, xlim=c(-2, 42),
  xlab = expression("mutants per gene"),
  panel = function(x, ...) {
    panel.grid(h = -1, v = -1, col = grey(0.9))
    panel.histogram(x, border = "white", ...)
    panel.ablineq(v = mean(x, na.rm = TRUE),
      fontfamily = "FreeSans", col = grey(0.5), lwd = 2, lty = 2)
  }
)

plot_reads_per_gene <- histogram(~ log2(reads_per_gene_median),
  df_fitness %>% filter(Time != 0) %>% group_by(locus_tag) %>%
    summarize(reads_per_gene_median = median(Counts)),
  par.settings = custom.colorblind(), breaks = 20,
  xlab = expression("log"[2]*" reads per gene (med)"),
  panel = function(x, ...) {
    panel.grid(h = -1, v = -1, col = grey(0.9))
    panel.histogram(x, border = "white", ...)
    panel.ablineq(v = mean(x[!is.infinite(x)]),
      fontfamily = "FreeSans", col = grey(0.5), lwd = 2, lty = 2)
  }
)

print(plot_reads_per_bc, split = c(1,1,2,1), more = TRUE)
print(plot_reads_per_gene, split = c(2,1,2,1))

Gene fitness analysis

Depletion over time (generations)

We can plot log2 FC or normalized gene fitness over generations. For this type of overview it is best to summarize individual replicates (4x) to the mean or median, per time point and condition. We also add genome annotation to the summary table. The plots shows that depletion of some strains is so strong already at 8 generations that fitness/log2 FC could not be quantified for 16 generations due to missing read counts. It is best to focus on 8 generations as because it provides a more complete picture.

df_fitness_summary <- df_fitness %>%
  group_by(locus_tag, scaffold, Time, Condition, Substrate, Strains_per_gene) %>%
  summarize(
    Norm_fg_median = median(Norm_fg, na.rm = TRUE),
    log2FC_median = median(log2FC, na.rm = TRUE),
    tstat_median = median(t, na.rm = TRUE)
  ) %>%
  left_join(df_ref)
`summarise()` has grouped output by 'locus_tag', 'scaffold', 'Time', 'Condition', 'Substrate'. You can override using the `.groups` argument.
Joining, by = "locus_tag"
plot_hist_log2FC <- df_fitness_summary %>%
  filter(all(!is.infinite(log2FC_median))) %>%
  group_by(Condition, Substrate) %>%
  slice(1:2000) %>%
  
  xyplot(log2FC_median ~ Time | Substrate * Condition, .,
    groups = locus_tag, as.table = TRUE,
    col = stdcol[1], alpha = 0.2, layout = c(3, 2),
    xlab = "", ylab = expression("log"[2]*" FC"),
    par.settings = custom.colorblind(), type = c("l"),
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), lwd = 2,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
    }
  )

plot_hist_normfg <- df_fitness_summary %>%
  filter(all(!is.infinite(log2FC_median))) %>%
  group_by(Condition, Substrate) %>%
  slice(1:2000) %>%
  
  xyplot(Norm_fg_median ~ Time | Substrate * Condition, .,
    groups = locus_tag, as.table = TRUE,
    col = stdcol[2], alpha = 0.2, layout = c(3, 2),
    xlab = "generations", ylab = "fitness",
    par.settings = custom.colorblind(), type = c("l"),
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), lwd = 2,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
    }
  )

print(plot_hist_log2FC, position = c(0,0.47,1,1), more = TRUE)
print(plot_hist_normfg, position = c(0,0,1,0.53))

As sort of an internal control, we compare the gene fitness obtained by a complex procedure to the log2 FC of read counts, which is a very simple measure of ‘fitness’. The two variables correlate well all tested conditions and substrates.

df_fitness_summary %>%
  filter(Time == 8, all(!is.infinite(log2FC_median))) %>%
  
  xyplot(Norm_fg_median ~ log2FC_median |  Substrate * Condition, .,
    as.table = TRUE, col = stdcol[5], pch = 19,
    alpha = 0.4, cex = 0.6,
    layout = c(3, 2), xlim = c(-8,3), ylim = c(-6,5),
    xlab = expression("log"[2]*" FC"), ylab = "fitness",
    par.settings = custom.colorblind(),
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
      panel.lmlineq(x, y, fontfamily = "FreeSans", r.squared = TRUE, lwd = 1.5, 
        col.text = 1, pos = 3, offset = 5, ...)
      panel.abline(a = 2, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
    }
  )

Comparing gene fitness between conditions

Similarly to Figure 2 in Wetmore et al, mBIO, 2015, we can investigate condition-dependent gene fitness by comparing conditions and substrates one-on-one. The correlation between substrates and growth regimes (continuous, pulsed) is quite different. The strongest correlation does exist for the pulsed vs continuous regime for each substrate (R = 0.64 to 0.87). And for the pulsed conditions, comparison between substrates (R = 0.62 to 0.72).

df_fitness_comp <- df_fitness_summary %>% filter(Time == 8) %>%
  group_by(locus_tag) %>% mutate(tstat_median = min(tstat_median)) %>%
  select(locus_tag, Condition, Substrate, Norm_fg_median, gene_name, COG_Process, tstat_median) %>%
  unite(Condition, Condition, Substrate) %>%
  pivot_wider(names_from = Condition, values_from = Norm_fg_median) %>%
  filter(!is.na(locus_tag))

custom_splom(df_fitness_comp %>% ungroup %>% 
  select(matches("conti|pulse")), col = grey(0.4, 0.4))

We can also compare selected conditions directly to identify genes enriched or depleted in several conditions.

# generalized plotting function
plot_fitness_comp <- function(data, vars){
  xyplot(get(vars[2]) ~ get(vars[1]), data,
    groups = abs(get(vars[1])-get(vars[2])) > 2, #tstat_median < -3
    par.settings = custom.colorblind(), col = stdcol[c(5,1)],
    pch = 19, alpha = 0.5, cex = 0.7, aspect = 1,
    xlim = c(-7, 4), ylim = c(-7, 4),
    xlab = vars[1], ylab = vars[2],
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.abline(v = 0, h = 0, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.abline(a = 0, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.abline(a = 2, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.abline(a = -2, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.xyplot(x, y, ...)
      panel.key(...)
    }
  )
}

plot_fitness_comp(df_fitness_comp, c("continuous_formate", "pulse_formate")) %>%
  print(position = c(0,0.5,0.5,1), more = TRUE)
plot_fitness_comp(df_fitness_comp, c("continuous_fructose", "pulse_fructose")) %>%
  print(position = c(0.5,0.5,1,1), more = TRUE)
plot_fitness_comp(df_fitness_comp, c("continuous_succinate", "pulse_succinate")) %>%
  print(position = c(0,0,0.5,0.5), more = TRUE)
plot_fitness_comp(df_fitness_comp, c("continuous_formate", "continuous_fructose")) %>%
  print(position = c(0.5,0,1,0.5), more = TRUE)

Differential fitness of selected genes

Most genes correlate in fitness value between conditions. That means, only some genes have a condition-specific fitness effect, i.e. increase or decrease fitness of the respective strain in one substrate or growth regime specifically. This section investigates specific genes and their functions that show such differential fitness between conditions. A simple comparison of one condition versus another will not be helpful as there two many possible combinations. To identify interesting sets of genes, cluster analysis can be performed.

# generate colorpalette for heatmap
heat_cols <- colorspace::diverging_hcl(n = 7, h = c(260, 0), c = 100, l = c(50, 90))

# create a matrix from wide fitness data for plotting heatmap
mat_heatmap <- df_fitness_comp %>% ungroup %>%
  filter(if_any(.cols = matches("conti|pulse"), ~ !between(., -2, 2))) %>%
  select(matches("locus_tag|conti|pulse")) %>%
  # filter out NA rows, and replace extreme values
  drop_na %>% mutate(across(matches("conti|pulse"), 
    function(x) {y = replace(x, x > 6, 6); replace(y, y < -6, -6)})) %>%
  column_to_rownames(var = "locus_tag") %>%
  as.matrix

# create cluster for reordering
mat_cluster <- mat_heatmap %>% dist %>% hclust(method = "ward.D")
mat_heatmap <- mat_heatmap[order.dendrogram(as.dendrogram(mat_cluster)), c(1,4,2,5,3,6)]

# plot heatmap
plot_heatmap <- levelplot(mat_heatmap,
  par.settings = custom.colorblind(),
  col.regions = colorRampPalette(heat_cols)(16),
  at = seq(-6, 6, 1), aspect = "fill",
  xlab = "", ylab = "", scales = list(x = list(draw = FALSE)),
  panel = function(x, y, z, ...) {
    panel.levelplot(x, y, z, ...)
    panel.abline(h = 1:5+0.5, col = "white", lwd = 1.5)
  }
)

print(plot_heatmap)

Clustering and silhouette analysis revealed that we have 4 to 7 clusters with acceptable separation, see the plot below. Of these, several clusters stick out:

  1. mutants specifically depleted in formate conditions = essential for formate
  2. mutants depleted in all conditions = essential in minimal medium
  3. mutants enriched in fructose growth, neutral on other substrates
  4. mutants enriched in continuous regime, mostly neutral in pulsed regime
  5. mutants slightly depleted in continuous succinate - low priority
  6. mutants slightly depleted in continuous fructose - low priority
  7. mutants slightly depleted in various conditions - low priority
silhouetteResult <- Rtools::silhouette_analysis(mat_heatmap, mat_cluster, 2:20)

Call:
hclust(d = ., method = "ward.D")

Cluster method   : ward.D 
Distance         : euclidean 
Number of objects: 310 

Silhouette analysis finished for clusters 2 to 20
silhouetteResult$plot.summary


# plot colored dendrogram
plot(color_branches(
  mat_cluster, k = 7,
  groupLabels = TRUE,
  col = stdcol[1:7]
))


# collect names of enriched/depleted genes
list_enriched <- vegan::cutreeord(mat_cluster, k = 7)
Registered S3 method overwritten by 'vegan':
  method     from      
  rev.hclust dendextend
list_enriched %>% table
.
 1  2  3  4  5  6  7 
34 40 14 27 80 67 48 

Enrichment of mutants over time (increased fitness)

One of the most puzzling results is the appearance of (very) fast growing mutants in some conditions. These mutants quickly enrich over only 16 generations to a volume of up to 20% of all library mutants, they “take over the culture”. This phenomenon was not observed previously in experiments with the Synechocystis sp. PCC6803 CRISPRi repression library (Yao et al., Nature Communications, 2020). The CRISPRi library is based on the same principle of depletion and enrichment of mutants depending on the associated fitness.

Strains enriched on fructose

What is the identity of the enriched genes? Which ones are enriched in several conditions? First we identify the mutants highly and specifically enriched on fructose, predominantly in the continuous growth regime.

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 3))) %>%
  filter(Time == 16) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 3))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))

Mechanisms enabling faster growth on fructose

The mechanism for increased growth on fructose seems clearly different from the mechanism for faster growth in all other (mainly continuous) conditions. For the fructose set, several genes are functionally linked suggesting a similar mechanism of action,

  1. Genes H16_B0517 (alcohol dehydrogenase) and H16_B1917 (aldehyde dehydrogenase): According to STRING DB, the genes are co-located i.e. either in direct proximity or in the same operon in alpha-, beta- and gamma-proteobacteria, and even cyanobcateria. BioCyc has 17 possible iso-enzymes in Cupriavidus for the 2-enzyme pathway “ethanol degradation 1”: ethanol –> acetaldehyde –> acetyl-CoA (reverse EtOH fermentation, obtain 1 NADH per reaction) catalyzed by both genes. What could be the mechanism for improved growth/yield on fructose? Chemostats are more yield- than growth selective, this pathway is probably functional and secretes EtOH as a fermentation product; KO of this pathway probably increases yield.

  2. Two genes are involved in nitrate respiration/reduction, PHG269 or narK, and H16_B2087, a NarL-family response regulator. NarK is a nitrate transporter, and NarL senses nitrate and then activates transcription of its targets by binding the DNA. According to this paper on NarL by Ruanto et al., 2020, “the Regulon DB database [11] for transcription regulation in E. coli lists 26 gene regulatory regions where NarL has a direct effect on transcript initiation, and it functions as an activator at 11 of these”. It is not clear what effect the KO of nitrate transport could have on growth or yield, nitrate is not part of the medium.

  3. Genes involved in translation/protein folding/sulphur metabolism:

    • H16_B2521, Alpha-ketoglutarate-dependent taurine dioxygenase; taurine is a sulphur carrier
    • H16_A2861, Glutathione S-transferase; Posttranslational modification, protein turnover, chaperones
    • H16_A1138, Thioredoxin; Posttranslational modification, protein turnover, chaperones
    • H16_A1336, yogA/tdcF, putative translation initiation inhibitor,yjg Ffamily; Translation, ribosomal structure and biogenesis
    • H16_A2553, cO, DNA repair protein RecO; Involved in DNA repair and RecF pathway recombination
  4. Several other gene products are presumably regulatory proteins:

    • 16_A0310, transcriptional regulator, GntR-family; Transcription
    • H16_B2215, HTH lysR-type domain-containing protein; helix turn helix DNA binding motif

Strains enriched on formate

Here, the obvious question with biotechnological importance is to find mutants with higher (specific) tolerance to formate, regardless if it’s pulsed or continuously supplied. We explicitly exclude mutants beneficial for other growth conditions too, which are dealt with in the next section. The clustering results of the previous section could not identify formate-specific mutants enriched after 8 generations. We need to lower the filter threshold to detect genes that have a more subtle enrichment (fitness >= 2 for formate, and <= 1 for all other conditions) after 16 generations.

list_fitness_formate <- df_fitness_summary %>%
  group_by(locus_tag, Substrate, Time) %>%
  mutate(formate_hit = case_when(
    Substrate == "formate" & max(Norm_fg_median) >= 2 ~ TRUE,
    Substrate != "formate" & max(Norm_fg_median) <= 1 ~ TRUE,
    TRUE ~ FALSE)
  ) %>% group_by(locus_tag, Time) %>%
  filter(all(formate_hit), n() == 6) %>% 
  pull(locus_tag) %>% unique

df_fitness_summary %>%
  filter(Time == 16, locus_tag %in% list_fitness_formate) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% list_fitness_formate) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))

Mechanisms enabling faster growth on formate

According to STRING DB there are no obvious relationships between these genes except for the pair PHG388 (pcaI, Oxoadipate CoA transferase alpha subunit) and H16_B1583 (pcaD, 3-Oxoadipate enol-lactone hydrolase). Both genes are part of the catechol degradation pathway that is itself a larger part of benzene degradation. The gene products catalyze two sequential reactions 3-oxoadipate-enol-lactone –> 3-oxoadipate (pcaD) –> 3-oxoadipyl-CoA (pcaI). The next reaction catalyzed by four isoenzymes makes succinyl-CoA which ends up in the Citrate cycle. How the KO of this pathway would improve tolerance to formate is unclear.

Strains enriched on succinate

This is similar to the analysis above for formate, with the exception that many mutants associated with higher fitness in succinate conditions enrich only after 16 generations, and only in the continuous regime. This seems be an artifact of read compression. We therefore use the same thresholds as for formate (fitness >= 2 for succinate, and <= 1 for all other conditions) but after 8 instead of 16 generations.

list_fitness_succinate <- df_fitness_summary %>%
  filter(Time == 8) %>%
  group_by(locus_tag, Substrate) %>%
  mutate(succinate_hit = case_when(
    Substrate == "succinate" & max(Norm_fg_median) >= 2 ~ TRUE,
    Substrate != "succinate" & max(Norm_fg_median) <= 1 ~ TRUE,
    TRUE ~ FALSE)
  ) %>% group_by(locus_tag) %>%
  filter(all(succinate_hit), n() == 6, locus_tag != "H16_A2912") %>% 
  pull(locus_tag) %>% unique

df_fitness_summary %>%
  filter(Time == 16, locus_tag %in% list_fitness_succinate) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% list_fitness_succinate) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))

Mechanisms enabling faster growth on succinate

A group of five highly related genes shows significant enrichment in (continuous) succinate: H16_2096 (dppA1), H16_A2406 (dppD2), H16_A2407 (dppC2), H16_A2408 (dppB2), and H16_A2409, dppA2. Alternative names are yejABCDE. All of these proteins are part of an ABC transporter of the PepT family. Quote from PFAM about this type of protein/domain: “All characterised members appear to be involved in the transport of oligopeptides or dipeptides. Some are important for sporulation or antibiotic resistance. Some dipeptide transporters also act on the heme precursor delta-aminolevulinic acid. The enrichment seems to be highly reproducible: each of these genes has 7 to 15 different insertion mutants, all showing the same average pattern. At least two more genes are also involved in (amino acid?) transport, H16_A3284 and H16_A2521. The latter one is a”D-amino acid transferase (D-AAT), required by bacteria to catalyse the synthesis of D-glutamic acid and D-alanine" (PFAM). Supposed mechanism of enrichment: Growth on succinate could lead to a shortage/imbalance of certain amino acids, and investment in uptake might give a growth advantage.

Strains enriched in several conditions

A wealth of genes/mutants are enriched in multiple mainly pulsed growth conditions (cluster 4).

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 4))) %>%
  filter(Time == 16) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 4))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))

To learn more about functional relationships between enriched genes/mutants, we can submit the gene list to the STRING interaction database and retrieve a network of probable interactions.

library(ggraph)
library(tidygraph)

# function retrieve network interaction data from STRING DB
# separate gene IDs by "%0d"; species/taxon ID for Cupriavidus necator H16: 381666
# (see https://string-db.org/cgi/organisms)
retrieve_STRING <- function(gene_ID, taxon_ID, min_score = 0000, ref = NULL) {
  gene_list <- paste(gene_ID, collapse = "%0d")
  string_graph <- paste0(
    "https://string-db.org/api/tsv/network?identifiers=", 
    gene_list, "&species=", taxon_ID, "&required_score=", min_score) %>%
  read_tsv(col_types = cols()) %>%
  mutate(across(matches("stringId"), function(x) gsub(paste0(taxon_ID, "."), "", x))) %>%
  as_tbl_graph()
  if (!is.null(ref)) {
    left_join(string_graph, ref, by = "name")
  } else {
    string_graph
  }
}
graph_all_enrich <- retrieve_STRING(
  gene_ID = names(subset(list_enriched, list_enriched == 4)),
  taxon_ID = 381666,
  ref = rename(df_ref, name = locus_tag)
)

graph_all_enrich %>% arrange(COG_Process) %>% activate(edges) %>% 
  filter(score >= 0.4) %>%
  ggraph(layout = 'linear', circular = TRUE) +
  geom_edge_arc(colour = grey(0.6, 0.5), aes(width = score)) + 
  geom_node_point(aes(colour = COG_Process), size = 5) +
  geom_node_text(nudge_y = 0.1, size = 3.2, aes(label = eggNOG_name, colour = COG_Process)) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_graph(background = "white", foreground = grey(0.5),
    plot_margin = margin(10, 10, 10, 10))

Mechanisms enabling faster growth in several conditions

The picture for these strains is more clear then for the fructose-enriched strains. 13 out of 27 enriched genes (48%), among them some of the most enriched, are directly involved in hydrogenase activity, i.e. hydrogen oxygenation (hoxA, hoxF, hoxU, hoxX, hypA, hypB, hypC, hypD hypE, hypF, hyaA, hyaB, hupH). The mechanism of enabling faster growth by KO of hydrogenases is unknown.

The second theme is enrichment of ptsI (H16_A0326) and ptsH (H16_A0325), together the entire functional unit of the PTS system, the PEP-dependent sugar phosphotransferase system (sugar PTS). “This major carbohydrate active-transport system catalyzes the phosphorylation of incoming sugar substrates concomitantly with their translocation across the cell membrane. Enzyme I transfers the phosphoryl group from (PEP) to the phosphoryl carrier protein (HPr).” Why does the KO of PTS system provides a growth benefit in in all conditions with almost equal contribution? Must be related to energy metabolism, because neither fructose (ABC transporter), succinate and def not formate are transported via a PTS.

Other interesting mechanisms: KO of RNA polymerase sigma factor RpoS (H16_A2373) seems to have a beneficial effect on steady state growth, as well as other stress related transcription factors like cold shock protein capB (H16_B2205), and scoF (H16_B0002).

The gene H16_B2043 is by far most abundant (by reads) and most enriched (by fitness) single mutant in almost all conditions. What is the mechanism? There is almost nothing known about this gene except that it probably has cyclic-guanylate-specific phosphodiesterase activity. This enzyme has, according to PFAM, two domains, the diguanylate cyclase (GGDEF) and the EAL domain. The first one synthesizes cyclic di-GMP Ryjenkov et al., J Bact, 2005. The second domain can act as phosphodiesterase and cleave cyclic cAMP or cGMP. Both compounds play a role in signaling, cAMP for example as the hormone in catabolite repression. According to the discussion in the linked paper, “Mutations in the GGDEF domain proteins or overexpression of such proteins affect exopolysaccharide synthesis […] and formation of biofilms. In C. crescentus, flagellum ejection, which is required for the switch from motile to sessile lifestyle, is impaired in a mutated GGDEF domain”. The underlying mechanism could be disrupted biofilm formation, higher flagellum activity, and thus non-settling phenotype in the bioreactor. Alternatively the mutation affects cAMP formation and disrupts or enhances catabolite repression (more likely that disrupted catabolite repression is beneficial).

Sanity checks

All above results are based on fitness score, which is not much more than a normalized log2 fold change of read count over time. We can compare if the mutants with extremely high fitness scores are also the ones that are super-abundant at the final time point, ie.e. have an average read count of >= 500,000 after 16 generations. In fact, only 3 genes have such an extreme read count, but 9 have more than 100,000 reads.

df_fitness %>% filter(Time == 16, locus_tag %in% (subset(list_enriched, list_enriched %in% c(3,4)) %>% names)) %>%
  group_by(locus_tag) %>% summarize(max_counts = max(Counts), max_fitness = max(Norm_fg)) %>%
  arrange(desc(max_counts))

On the other hand, we can check which genes have an extremely high read count (>= 500,000 in any condition) and see if this correlates with high fitness score. It does partly. 3 out of 7 highly abundant gene mutants enrich extremely over time. The other enrich too, but only 2^1 = 2 to 2^3.5 = 11 times. These non-enriching super-abundant mutants are:

  • H16_A0774 - cphA, Cyanophycin synthase. Supposed mechanism: enriches in LB medium as it does not turn Asp into cyanophycin. No benefit on minimal medium.
  • H16_B2570 - fecA, Outer membrane receptor for Fe(III). Supposed mechanism: regulator activity together with fecR, but also transmembrane transporter for siderophores. KO reduces sensitivty to excess iron??
  • H16_A3145 - Conserved protein/domain typically associated with flavoprotein oxygenases, DIM6/NTAB family. Supposed mechanism: unknown.
df_fitness %>% filter(Time == 16, Counts > 5*10^5) %>%
  group_by(locus_tag) %>% summarize(max_counts = max(Counts), max_fitness = max(Norm_fg)) %>%
  arrange(desc(max_counts))

Depletion of mutants over time (reduced fitness)

Depletion on formate

Several genes/mutants are depleted over time depending on growth condition. Most interesting in this context are cluster 1 and 2, depletion only on formate, or in all conditions, respectively. Not surprisingly, formate-specific genes are highly depleted during growth on formate, but not cbb (Calvin cycle) genes.

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 1))) %>%
  filter(Time == 16) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 1))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))

What is the role of formate-essential genes? Are there patterns that emerge? We can submit the depleted gene list to STRING DB and retrieve a possible network of interactions. The network shows immediately that there are many high-scoring interactions between the genes, particularly the soluble FDH genes are sticking out as one cluster (fdsABDG) and their transcriptional regulator fdsR. Then the molybdenum cofactor processing proteins moaCDE, moeA, mobA, mog. And a range of cytochrome genes responsible for acceptance and transport of of electrons from formate to the ETC in cytoplasmic membrane (petABC, ccoGNOP, cyc). It is also intersting what is NOT essential on formate, such as no cbb genes except the master transcriptional regulator cbbR. The reason for this must be the redundancy of all cbb genes (two operons).

graph_formate_depl <- retrieve_STRING(
  gene_ID = names(subset(list_enriched, list_enriched == 1)),
  taxon_ID = 381666,
  ref = rename(df_ref, name = locus_tag)
)

graph_formate_depl %>% arrange(COG_Process) %>% activate(edges) %>% 
  filter(score >= 0.4) %>%
  ggraph(layout = 'linear', circular = TRUE) +
  geom_edge_arc(colour = grey(0.6, 0.5), aes(width = score)) + 
  geom_node_point(aes(colour = COG_Process), size = 5) +
  geom_node_text(nudge_y = 0.1, size = 3.2, aes(label = eggNOG_name, colour = COG_Process)) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_graph(background = "white", foreground = grey(0.5),
    plot_margin = margin(10, 10, 10, 10))

Depletion in all conditions

Which genes are depleted in all conditions (cluster 2)? Here we see the effect of the read compression, such that many genes seem to “recover” from initial depletion and enrich from 8 to 16 generation time point. This effect is only apparent for the continuous conditions and with very high probability related to the distortion of read counts by super-enriching mutants. The 8 generation time point should be considered as more reliable in those cases.

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 2))) %>%
  filter(Time == 16) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 2))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))

graph_all_depl <- retrieve_STRING(
  gene_ID = names(subset(list_enriched, list_enriched == 2)),
  taxon_ID = 381666,
  ref = rename(df_ref, name = locus_tag)
)

graph_all_depl %>% arrange(COG_Process) %>% activate(edges) %>% 
  filter(score >= 0.4) %>%
  ggraph(layout = 'linear', circular = TRUE) +
  geom_edge_arc(colour = grey(0.6, 0.5), aes(width = score)) + 
  geom_node_point(aes(colour = COG_Process), size = 5) +
  geom_node_text(nudge_y = 0.1, size = 3.2, aes(label = eggNOG_name, colour = COG_Process)) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_graph(background = "white", foreground = grey(0.5),
    plot_margin = margin(10, 10, 10, 10))

---
title: "Analysis of competition experiments with a barcoded transposon library"
date: "`r format(Sys.time(), '%d %B, %Y')`"
author: "Michael Jahn"
output:
  html_notebook: 
    theme: spacelab
    toc: yes
---

## Description

This R notebook is a bioinformatics pipeline to analyze fitness data obtained from a barcoded transposon library in *Ralstonia eutropha* a.k.a. *Cupriavidus necator*. For background and details regarding the method, see [Wetmore at al., mBio, 2015](https://mbio.asm.org/content/6/3/e00306-15) and [Price et al., Nature, 2018](http://www.nature.com/articles/s41586-018-0124-0)).


## Libraries

```{r, message = FALSE}
# optionally install repos from github
# devtools::install_github("m-jahn/lattice-tools")
# devtools::install_github("m-jahn/R-tools")
library(lattice)
library(latticeExtra)
library(latticetools)
library(tidyverse)
library(dendextend)
library(Rtools)
library(colorspace)
```


## Overview of barcode/transposon read counts

### Data import and processing

Read in the main data tables with A) reads per barcode and sample ('pool counts'), and B) the fitness tables. Tables were obtained by processing sequencing data with a custom [BarSeq pipeline](https://github.com/m-jahn/rebar). The 32 generation sequencing samples are removed due to the low read count in the `continuous` samples.

```{r, message = FALSE}
# import barseq counts data in wide format and reshape to long format
df_counts_frc <- read_tsv("../../../rebar/data/20201222_barseq_frc/results/result.poolcount") %>%
  select(!matches("32gen|_32_")) %>%
  pivot_longer(
    cols = !all_of(c("barcode", "rcbarcode", "scaffold", "strand", "pos")), 
    names_to = "sample", values_to = "n_reads")

df_counts_suc <- read_tsv("../../../rebar/data/20210407_barseq_suc_for/results/result.poolcount") %>%
  pivot_longer(
    cols = !all_of(c("barcode", "rcbarcode", "scaffold", "strand", "pos")), 
    names_to = "sample", values_to = "n_reads")

# merge barcode counts tables
df_counts <- bind_rows(df_counts_frc, df_counts_suc)

# import fitness data, the final output of the BarSeq pipeline
load("../../../rebar/data/20201222_barseq_frc/results/fitness_gene.Rdata")
df_fitness_frc <- fitness_gene %>%
  filter(Condition != "long pulse", Time != 32) %>%
  mutate(ID = as.numeric(ID), Substrate = "fructose", 
    Condition = str_remove(Condition, "short "))

load("../../../rebar/data/20210407_barseq_suc_for/results/fitness_gene.Rdata")
df_fitness_suc <- fitness_gene %>%
  separate(Condition, sep = "_", into = c("Substrate", "Condition"))

# merge fitness tables
df_fitness <- bind_rows(df_fitness_frc, df_fitness_suc) %>%
  rename(locus_tag = locusId)
rm("df_fitness_frc", "df_fitness_suc", "df_counts_frc", "df_counts_suc")

# import genome annotation
df_ref <- read_csv("../data/ref/Ralstonia_H16_genome_annotation.csv") %>%
  filter(!duplicated(locus_tag)) %>%
  mutate(eggNOG_name = if_else(is.na(eggNOG_name), gene_name, eggNOG_name))

# define standard colors
stdcol <- custom.colorblind()$superpose.line$col
```

### Summary statistics

Overview about the number of reads per barcode, barcodes per gene and so on. Around 8-10 M reads were mapped on average, per sample. 

```{r, fig.width = 8, fig.height = 8, message = FALSE}
# Number of total mapped reads
df_counts %>% group_by(sample) %>%
  summarize(n_million_reads = sum(n_reads)/10^6) %>%
  barchart(factor(sample) ~ n_million_reads, .,
    par.settings = custom.colorblind(),
    horizontal = TRUE, border = NULL,
    scales = list(y = list(cex = 0.7)),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.barchart(x, y, ...)
    }
  )
```

Distribution of **number of reads per barcode**. There is a sufficient number of reads for quantification, on average log2(n) = 5 = 32 reads per barcode.

```{r, fig.width = 8, fig.height = 6.2}
plot_reads_per_bc <- histogram(~ log2(n_reads) | sample,
  df_counts, as.table = TRUE, layout = c(8,6),
  par.settings = custom.colorblind(),
  between = list(x = 0.5, y = 0.5),
  xlab = expression("log"[2]*" reads per barcode"),
  scales = list(alternating = FALSE),
  panel = function(x, ...) {
    panel.grid(h = -1, v = -1, col = grey(0.9))
    panel.histogram(x, border = "white", ...)
  }
)

print(plot_reads_per_bc)
```

Similarly to the above, this is an overview about the **number of barcodes per gene** as a histogram. This distribution is the same for all conditions and replicates. The second plot is the **number of reads per gene**, averaged as median over all conditions (excluding 0 time point where counts were averaged by BarSeq pipeline). The average reads per gene are log2(n) = 8, which translates to n = 2^8 or around 256 reads per gene.


```{r, fig.width = 8, fig.height = 4.2, message = FALSE}
plot_reads_per_bc <- histogram(~ Strains_per_gene,
  df_fitness %>% select(locus_tag, Strains_per_gene) %>% 
    distinct %>% filter(Strains_per_gene < 40), 
  par.settings = custom.colorblind(), breaks = 20, xlim=c(-2, 42),
  xlab = expression("mutants per gene"),
  panel = function(x, ...) {
    panel.grid(h = -1, v = -1, col = grey(0.9))
    panel.histogram(x, border = "white", ...)
    panel.ablineq(v = mean(x, na.rm = TRUE),
      fontfamily = "FreeSans", col = grey(0.5), lwd = 2, lty = 2)
  }
)

plot_reads_per_gene <- histogram(~ log2(reads_per_gene_median),
  df_fitness %>% filter(Time != 0) %>% group_by(locus_tag) %>%
    summarize(reads_per_gene_median = median(Counts)),
  par.settings = custom.colorblind(), breaks = 20,
  xlab = expression("log"[2]*" reads per gene (med)"),
  panel = function(x, ...) {
    panel.grid(h = -1, v = -1, col = grey(0.9))
    panel.histogram(x, border = "white", ...)
    panel.ablineq(v = mean(x[!is.infinite(x)]),
      fontfamily = "FreeSans", col = grey(0.5), lwd = 2, lty = 2)
  }
)

print(plot_reads_per_bc, split = c(1,1,2,1), more = TRUE)
print(plot_reads_per_gene, split = c(2,1,2,1))
```

## Gene fitness analysis

### Depletion over time (generations)

We can plot log2 FC or normalized gene fitness over generations. For this type of overview it is best to summarize individual replicates (4x) to the mean or median, per time point and condition. We also add genome annotation to the summary table. The plots shows that depletion of some strains is so strong already at 8 generations that fitness/log2 FC could not be quantified for 16 generations due to missing read counts. It is best to focus on 8 generations as because it provides a more complete picture.

```{r, messages = FALSE}
df_fitness_summary <- df_fitness %>%
  group_by(locus_tag, scaffold, Time, Condition, Substrate, Strains_per_gene) %>%
  summarize(
    Norm_fg_median = median(Norm_fg, na.rm = TRUE),
    log2FC_median = median(log2FC, na.rm = TRUE),
    tstat_median = median(t, na.rm = TRUE)
  ) %>%
  left_join(df_ref)
```


```{r, fig.width = 6.5, fig.height = 8}
plot_hist_log2FC <- df_fitness_summary %>%
  filter(all(!is.infinite(log2FC_median))) %>%
  group_by(Condition, Substrate) %>%
  slice(1:2000) %>%
  
  xyplot(log2FC_median ~ Time | Substrate * Condition, .,
    groups = locus_tag, as.table = TRUE,
    col = stdcol[1], alpha = 0.2, layout = c(3, 2),
    xlab = "", ylab = expression("log"[2]*" FC"),
    par.settings = custom.colorblind(), type = c("l"),
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), lwd = 2,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
    }
  )

plot_hist_normfg <- df_fitness_summary %>%
  filter(all(!is.infinite(log2FC_median))) %>%
  group_by(Condition, Substrate) %>%
  slice(1:2000) %>%
  
  xyplot(Norm_fg_median ~ Time | Substrate * Condition, .,
    groups = locus_tag, as.table = TRUE,
    col = stdcol[2], alpha = 0.2, layout = c(3, 2),
    xlab = "generations", ylab = "fitness",
    par.settings = custom.colorblind(), type = c("l"),
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE), lwd = 2,
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
    }
  )

print(plot_hist_log2FC, position = c(0,0.47,1,1), more = TRUE)
print(plot_hist_normfg, position = c(0,0,1,0.53))
```

As sort of an internal control, we compare the gene fitness obtained by a complex procedure to the log2 FC of read counts, which is a very simple measure of 'fitness'. The two variables correlate well all tested conditions and substrates.

```{r, fig.width = 6.5, fig.height = 5.2}
df_fitness_summary %>%
  filter(Time == 8, all(!is.infinite(log2FC_median))) %>%
  
  xyplot(Norm_fg_median ~ log2FC_median |  Substrate * Condition, .,
    as.table = TRUE, col = stdcol[5], pch = 19,
    alpha = 0.4, cex = 0.6,
    layout = c(3, 2), xlim = c(-8,3), ylim = c(-6,5),
    xlab = expression("log"[2]*" FC"), ylab = "fitness",
    par.settings = custom.colorblind(),
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
      panel.lmlineq(x, y, fontfamily = "FreeSans", r.squared = TRUE, lwd = 1.5, 
        col.text = 1, pos = 3, offset = 5, ...)
      panel.abline(a = 2, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
    }
  )
```

### Comparing gene fitness between conditions

Similarly to Figure 2 in [Wetmore et al, mBIO, 2015](https://mbio.asm.org/content/6/3/e00306-15), we can investigate condition-dependent gene fitness by comparing conditions and substrates one-on-one. The correlation between substrates and growth regimes (continuous, pulsed) is quite different. The strongest correlation does exist for the pulsed vs continuous regime for each substrate (R = 0.64 to 0.87). And for the pulsed conditions, comparison between substrates (R = 0.62 to 0.72).

```{r, fig.width = 8, fig.height = 8, message = FALSE}
df_fitness_comp <- df_fitness_summary %>% filter(Time == 8) %>%
  group_by(locus_tag) %>% mutate(tstat_median = min(tstat_median)) %>%
  select(locus_tag, Condition, Substrate, Norm_fg_median, gene_name, COG_Process, tstat_median) %>%
  unite(Condition, Condition, Substrate) %>%
  pivot_wider(names_from = Condition, values_from = Norm_fg_median) %>%
  filter(!is.na(locus_tag))

custom_splom(df_fitness_comp %>% ungroup %>% 
  select(matches("conti|pulse")), col = grey(0.4, 0.4))
```

We can also compare selected conditions directly to identify genes enriched or depleted in several conditions.

```{r, fig.width = 8, fig.height = 8, message = FALSE}
# generalized plotting function
plot_fitness_comp <- function(data, vars){
  xyplot(get(vars[2]) ~ get(vars[1]), data,
    groups = abs(get(vars[1])-get(vars[2])) > 2, #tstat_median < -3
    par.settings = custom.colorblind(), col = stdcol[c(5,1)],
    pch = 19, alpha = 0.5, cex = 0.7, aspect = 1,
    xlim = c(-7, 4), ylim = c(-7, 4),
    xlab = vars[1], ylab = vars[2],
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.abline(v = 0, h = 0, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.abline(a = 0, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.abline(a = 2, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.abline(a = -2, b = 1, col = grey(0.5), lty = 2, lwd = 1.5)
      panel.xyplot(x, y, ...)
      panel.key(...)
    }
  )
}

plot_fitness_comp(df_fitness_comp, c("continuous_formate", "pulse_formate")) %>%
  print(position = c(0,0.5,0.5,1), more = TRUE)
plot_fitness_comp(df_fitness_comp, c("continuous_fructose", "pulse_fructose")) %>%
  print(position = c(0.5,0.5,1,1), more = TRUE)
plot_fitness_comp(df_fitness_comp, c("continuous_succinate", "pulse_succinate")) %>%
  print(position = c(0,0,0.5,0.5), more = TRUE)
plot_fitness_comp(df_fitness_comp, c("continuous_formate", "continuous_fructose")) %>%
  print(position = c(0.5,0,1,0.5), more = TRUE)
```

### Differential fitness of selected genes

Most genes correlate in fitness value between conditions. That means, only some genes have a **condition-specific fitness effect**, i.e. increase or decrease fitness of the respective strain in one substrate or growth regime specifically. This section investigates specific genes and their functions that show such differential fitness between conditions. A simple comparison of one condition versus another will not be helpful as there two many possible combinations. To identify interesting sets of genes, cluster analysis can be performed.

```{r, fig.width = 10, fig.height = 2.5}
# generate colorpalette for heatmap
heat_cols <- colorspace::diverging_hcl(n = 7, h = c(260, 0), c = 100, l = c(50, 90))

# create a matrix from wide fitness data for plotting heatmap
mat_heatmap <- df_fitness_comp %>% ungroup %>%
  filter(if_any(.cols = matches("conti|pulse"), ~ !between(., -2, 2))) %>%
  select(matches("locus_tag|conti|pulse")) %>%
  # filter out NA rows, and replace extreme values
  drop_na %>% mutate(across(matches("conti|pulse"), 
    function(x) {y = replace(x, x > 6, 6); replace(y, y < -6, -6)})) %>%
  column_to_rownames(var = "locus_tag") %>%
  as.matrix

# create cluster for reordering
mat_cluster <- mat_heatmap %>% dist %>% hclust(method = "ward.D")
mat_heatmap <- mat_heatmap[order.dendrogram(as.dendrogram(mat_cluster)), c(1,4,2,5,3,6)]

# plot heatmap
plot_heatmap <- levelplot(mat_heatmap,
  par.settings = custom.colorblind(),
  col.regions = colorRampPalette(heat_cols)(16),
  at = seq(-6, 6, 1), aspect = "fill",
  xlab = "", ylab = "", scales = list(x = list(draw = FALSE)),
  panel = function(x, y, z, ...) {
    panel.levelplot(x, y, z, ...)
    panel.abline(h = 1:5+0.5, col = "white", lwd = 1.5)
  }
)

print(plot_heatmap)
```

```{r, include = FALSE}
# silently export dendrogram
png("../figures/figure_barseq_heat.png", width = 1690, height = 400, res = 150)
print(plot_heatmap)
dev.off()
```

Clustering and silhouette analysis revealed that we have 4 to 7 clusters with acceptable separation, see the plot below. Of these, several clusters stick out:

1. mutants specifically depleted in formate conditions = essential for formate
2. mutants depleted in all conditions = essential in minimal medium
3. mutants enriched in fructose growth, neutral on other substrates
4. mutants enriched in continuous regime, mostly neutral in pulsed regime
5. mutants _slightly_ depleted in continuous succinate - low priority
6. mutants _slightly_ depleted in continuous fructose - low priority
7. mutants _slightly_ depleted in various conditions - low priority

```{r, fig.width = 10, fig.height = 4}
silhouetteResult <- Rtools::silhouette_analysis(mat_heatmap, mat_cluster, 2:20)
silhouetteResult$plot.summary

# plot colored dendrogram
plot(color_branches(
  mat_cluster, k = 7,
  groupLabels = TRUE,
  col = stdcol[1:7]
))

# collect names of enriched/depleted genes
list_enriched <- vegan::cutreeord(mat_cluster, k = 7)
list_enriched %>% table
```

```{r, include = FALSE}
# silently export dendrogram
png("../figures/figure_barseq_dendro.png", width = 1600, height = 700, res = 150)
plot(color_branches(
  mat_cluster, k = 7,
  groupLabels = TRUE,
  col = stdcol[1:7]
))
dev.off()
```


### Enrichment of mutants over time (increased fitness)

One of the most puzzling results is the appearance of (very) fast growing mutants in some conditions. These mutants quickly enrich over only 16 generations to a volume of up to 20% of all library mutants, they "take over the culture". This phenomenon was not observed previously in experiments with the *Synechocystis* sp. PCC6803 CRISPRi repression library (Yao et al., Nature Communications, 2020). The CRISPRi library is based on the same principle of depletion and enrichment of mutants depending on the associated fitness.

```{r, echo = FALSE}
# color vector for conditions
subs_col <- c(stdcol[1], lighten(stdcol[1], 0.3), stdcol[2],
  lighten(stdcol[2], 0.3), stdcol[3], lighten(stdcol[3], 0.3))

# generalized function for barplots
barchart_fitness <- function(data) {
  # sort by cumulative fitness
  data %>% group_by(locus_tag) %>% 
  mutate(sum_fitness = sum(Norm_fg_median)) %>%
  arrange(sum_fitness) %>%
  mutate(name = paste0(locus_tag, " | ", eggNOG_name, " | ", 
    substr(eggNOG_description, 1, 40))) %>%
  
  barchart(factor(name, unique(name)) ~ Norm_fg_median, .,
    group = paste(Substrate, Condition),
    par.settings = custom.colorblind(),
    col = subs_col, border = "white", stack = TRUE,
    xlab = "cumulative fitness",
    scales = list(y = list(cex = 0.7)),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.barchart(x, y, ...)
      panel.key(..., points = FALSE, cex = 0.6, corner = c(0.97, 0.03))
    }
  )
}

linechart_fitness <- function(data) {
  xyplot(Norm_fg_median ~ Time | locus_tag, data,
    groups = paste(Substrate, Condition), as.table = TRUE,
    par.settings = custom.colorblind(), type = c("l"),
    col = subs_col, lwd = 2.5,
    layout = c(5, ceiling(length(unique(data$locus_tag))/5)),
    xlab = "generations", ylab = "fitness",
    between = list(x = 0.5, y = 0.5),
    scales = list(alternating = FALSE),
    panel = function(x, y, ...) {
      panel.grid(h = -1, v = -1, col = grey(0.9))
      panel.xyplot(x, y, ...)
    }
  )
}
```

#### Strains enriched on fructose

What is the identity of the enriched genes? Which ones are enriched in several conditions? First we identify the mutants highly and specifically enriched on fructose, predominantly in the continuous growth regime.


```{r, fig.width = 10, fig.height = 4.5}
df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 3))) %>%
  filter(Time == 16) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 3))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))
```

**Mechanisms enabling faster growth on fructose**

The mechanism for increased growth on fructose seems clearly different from the mechanism for faster growth in all other (mainly continuous)  conditions. For the fructose set, several genes are functionally linked suggesting a similar mechanism of action, 

1. Genes `H16_B0517` (alcohol dehydrogenase) and `H16_B1917` (aldehyde dehydrogenase): According to STRING DB, the genes are co-located i.e. either in direct proximity or in the same operon in alpha-, beta- and gamma-proteobacteria, and even cyanobcateria. BioCyc has 17 possible iso-enzymes in *Cupriavidus* for the 2-enzyme pathway "ethanol degradation 1": ethanol --> acetaldehyde --> acetyl-CoA (reverse EtOH fermentation, obtain 1 NADH per reaction) catalyzed by both genes. What could be the mechanism for improved growth/yield on fructose? Chemostats are more yield- than growth selective, this pathway is probably functional and secretes EtOH as a fermentation product; KO of this pathway probably increases yield.

2. Two genes are involved in nitrate respiration/reduction, `PHG269` or narK, and `H16_B2087`, a NarL-family response regulator. NarK is a nitrate transporter, and NarL senses nitrate and then activates transcription of its targets by binding the DNA. According to [this paper on NarL by Ruanto et al., 2020](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7419079/), "the Regulon DB database [11] for transcription regulation in E. coli lists 26 gene regulatory regions where NarL has a direct effect on transcript initiation, and it functions as an activator at 11 of these". It is not clear what effect the KO of nitrate transport could have on growth or yield, nitrate is not part of the medium.

3. Genes involved in translation/protein folding/sulphur metabolism:
    - `H16_B2521`, Alpha-ketoglutarate-dependent taurine dioxygenase; taurine is a sulphur carrier
    - `H16_A2861`, Glutathione S-transferase; Posttranslational modification, protein turnover, chaperones
    - `H16_A1138`, Thioredoxin; Posttranslational modification, protein turnover, chaperones
    - `H16_A1336`, yogA/tdcF, putative translation initiation inhibitor,yjg Ffamily; Translation, ribosomal structure and biogenesis
    - `H16_A2553`, cO, DNA repair protein RecO; Involved in DNA repair and RecF pathway recombination

4. Several other gene products are presumably regulatory proteins:
    - `16_A0310`, transcriptional regulator, GntR-family; Transcription
    - `H16_B2215`, HTH lysR-type domain-containing protein; helix turn helix DNA binding motif

#### Strains enriched on formate

Here, the obvious question with biotechnological importance is to find mutants with higher (specific) tolerance to formate, regardless if it's pulsed or continuously supplied. We explicitly exclude mutants beneficial for other growth conditions too, which are dealt with in the next section. The clustering results of the previous section could not identify formate-specific mutants enriched after 8 generations. We need to lower the filter threshold to detect genes that have a more subtle enrichment (fitness >= 2 for formate, and <= 1 for all other conditions) after 16 generations.

```{r, fig.width = 10, fig.height = 4.5}
list_fitness_formate <- df_fitness_summary %>%
  group_by(locus_tag, Substrate, Time) %>%
  mutate(formate_hit = case_when(
    Substrate == "formate" & max(Norm_fg_median) >= 2 ~ TRUE,
    Substrate != "formate" & max(Norm_fg_median) <= 1 ~ TRUE,
    TRUE ~ FALSE)
  ) %>% group_by(locus_tag, Time) %>%
  filter(all(formate_hit), n() == 6) %>% 
  pull(locus_tag) %>% unique

df_fitness_summary %>%
  filter(Time == 16, locus_tag %in% list_fitness_formate) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% list_fitness_formate) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))
```

**Mechanisms enabling faster growth on formate**

According to STRING DB there are no obvious relationships between these genes except for the pair `PHG388` (pcaI, Oxoadipate CoA transferase alpha subunit) and `H16_B1583` (pcaD, 3-Oxoadipate enol-lactone hydrolase). Both genes are part of the catechol degradation pathway that is itself a larger part of benzene degradation. The gene products catalyze two sequential reactions 3-oxoadipate-enol-lactone --> 3-oxoadipate (pcaD) --> 3-oxoadipyl-CoA (pcaI). The next reaction catalyzed by four isoenzymes makes succinyl-CoA which ends up in the Citrate cycle. How the KO of this pathway would improve tolerance to formate is unclear.

#### Strains enriched on succinate

This is similar to the analysis above for formate, with the exception that many mutants associated with higher fitness in succinate conditions enrich only after 16 generations, and only in the continuous regime. This seems be an artifact of read compression. We therefore use the same thresholds as for formate (fitness >= 2 for succinate, and <= 1 for all other conditions) but after 8 instead of 16 generations.

```{r, fig.width = 10, fig.height = 4.5}
list_fitness_succinate <- df_fitness_summary %>%
  filter(Time == 8) %>%
  group_by(locus_tag, Substrate) %>%
  mutate(succinate_hit = case_when(
    Substrate == "succinate" & max(Norm_fg_median) >= 2 ~ TRUE,
    Substrate != "succinate" & max(Norm_fg_median) <= 1 ~ TRUE,
    TRUE ~ FALSE)
  ) %>% group_by(locus_tag) %>%
  filter(all(succinate_hit), n() == 6, locus_tag != "H16_A2912") %>% 
  pull(locus_tag) %>% unique

df_fitness_summary %>%
  filter(Time == 16, locus_tag %in% list_fitness_succinate) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% list_fitness_succinate) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))
```

**Mechanisms enabling faster growth on succinate**

A group of five highly related genes shows significant enrichment in (continuous) succinate: `H16_2096` (dppA1), `H16_A2406` (dppD2), `H16_A2407` (dppC2), `H16_A2408` (dppB2), and `H16_A2409`, dppA2. Alternative names are yejABCDE. All of these proteins are part of an ABC transporter of the PepT family. Quote from PFAM about this type of protein/domain: "All characterised members appear to be involved in the transport of oligopeptides or dipeptides. Some are important for sporulation or antibiotic resistance. Some dipeptide transporters also act on the heme precursor delta-aminolevulinic acid.
The enrichment seems to be highly reproducible: each of these genes has 7 to 15 different insertion mutants, all showing the same average pattern. At least two more genes are also involved in (amino acid?) transport, `H16_A3284` and `H16_A2521`. The latter one is a "D-amino acid transferase (D-AAT), required by bacteria to catalyse the synthesis of D-glutamic acid and D-alanine" (PFAM).
Supposed mechanism of enrichment: Growth on succinate could lead to a shortage/imbalance of certain amino acids, and investment in uptake might give a growth advantage.

#### Strains enriched in several conditions

A wealth of genes/mutants are enriched in multiple mainly pulsed growth conditions (cluster 4).

```{r, fig.width = 10, fig.height = 6.5}
df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 4))) %>%
  filter(Time == 16) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 4))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))
```

To learn more about functional relationships between enriched genes/mutants, we can submit the gene list to the STRING interaction database and retrieve a network of probable interactions.

```{r, message = FALSE}
library(ggraph)
library(tidygraph)

# function retrieve network interaction data from STRING DB
# separate gene IDs by "%0d"; species/taxon ID for Cupriavidus necator H16: 381666
# (see https://string-db.org/cgi/organisms)
retrieve_STRING <- function(gene_ID, taxon_ID, min_score = 0000, ref = NULL) {
  gene_list <- paste(gene_ID, collapse = "%0d")
  string_graph <- paste0(
    "https://string-db.org/api/tsv/network?identifiers=", 
    gene_list, "&species=", taxon_ID, "&required_score=", min_score) %>%
  read_tsv(col_types = cols()) %>%
  mutate(across(matches("stringId"), function(x) gsub(paste0(taxon_ID, "."), "", x))) %>%
  as_tbl_graph()
  if (!is.null(ref)) {
    left_join(string_graph, ref, by = "name")
  } else {
    string_graph
  }
}
```


```{r, fig.width = 8, fig.height = 4.8}
graph_all_enrich <- retrieve_STRING(
  gene_ID = names(subset(list_enriched, list_enriched == 4)),
  taxon_ID = 381666,
  ref = rename(df_ref, name = locus_tag)
)

graph_all_enrich %>% arrange(COG_Process) %>% activate(edges) %>% 
  filter(score >= 0.4) %>%
  ggraph(layout = 'linear', circular = TRUE) +
  geom_edge_arc(colour = grey(0.6, 0.5), aes(width = score)) + 
  geom_node_point(aes(colour = COG_Process), size = 5) +
  geom_node_text(nudge_y = 0.1, size = 3.2, aes(label = eggNOG_name, colour = COG_Process)) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_graph(background = "white", foreground = grey(0.5),
    plot_margin = margin(10, 10, 10, 10))
```

**Mechanisms enabling faster growth in several conditions**

The picture for these strains is more clear then for the fructose-enriched strains. 13 out of 27 enriched genes (48%), among them some of the most enriched, are directly involved in hydrogenase activity, i.e. hydrogen oxygenation (hoxA, hoxF, hoxU, hoxX, hypA, hypB, hypC, hypD hypE, hypF, hyaA, hyaB, hupH). The mechanism of enabling faster growth by KO of hydrogenases is unknown.

The second theme is enrichment of ptsI (`H16_A0326`) and ptsH (`H16_A0325`), together the entire functional unit of the PTS system, the PEP-dependent sugar phosphotransferase system (sugar PTS). "This major carbohydrate active-transport system catalyzes the phosphorylation of incoming sugar substrates concomitantly with their translocation across the cell membrane. Enzyme I transfers the phosphoryl group from (PEP) to the phosphoryl carrier protein (HPr)."
Why does the KO of PTS system provides a growth benefit in *in all conditions* with almost equal contribution? Must be related to energy metabolism, because neither fructose (ABC transporter), succinate and def not formate are transported via a PTS.

Other interesting mechanisms: KO of RNA polymerase sigma factor RpoS (`H16_A2373`) seems to have a beneficial effect on steady state growth, as well as other stress related transcription factors like cold shock protein capB (`H16_B2205`), and scoF (`H16_B0002`).

The gene `H16_B2043` is by far most abundant (by reads) and most enriched (by fitness) single mutant in almost all conditions. What is the mechanism? There is almost nothing known about this gene except that it probably has cyclic-guanylate-specific phosphodiesterase activity. This enzyme has, according to PFAM, two domains, the diguanylate cyclase (GGDEF) and the EAL domain. The first one synthesizes cyclic di-GMP [Ryjenkov et al., J Bact, 2005](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1064016/). The second domain can act as phosphodiesterase and cleave cyclic cAMP or cGMP. Both compounds play a role in signaling, cAMP for example as the hormone in catabolite repression. According to the discussion in the linked paper, "Mutations in the GGDEF domain proteins or overexpression of such proteins affect exopolysaccharide synthesis [...] and formation of biofilms. In C. crescentus, flagellum ejection, which is required for the switch from motile to sessile lifestyle, is impaired in a mutated GGDEF domain". The underlying mechanism could be disrupted biofilm formation, higher flagellum activity, and thus non-settling phenotype in the bioreactor. Alternatively the mutation affects cAMP formation and disrupts or enhances catabolite repression (more likely that disrupted catabolite repression is beneficial).


#### Sanity checks

All above results are based on fitness score, which is not much more than a normalized log2 fold change of read count over time.
We can compare if the mutants with extremely high fitness scores are also the ones that are super-abundant at the final time point, ie.e. have an average read count of >= 500,000 after 16 generations. In fact, only 3 genes have such an extreme read count, but 9 have more than 100,000 reads.

```{r}
df_fitness %>% filter(Time == 16, locus_tag %in% (subset(list_enriched, list_enriched %in% c(3,4)) %>% names)) %>%
  group_by(locus_tag) %>% summarize(max_counts = max(Counts), max_fitness = max(Norm_fg)) %>%
  arrange(desc(max_counts))
```

On the other hand, we can check which genes have an extremely high read count (>= 500,000 in *any* condition) and see if this correlates with high fitness score. It does partly. 3 out of 7 highly abundant gene mutants enrich extremely over time. The other enrich too, but only 2^1 = 2 to 2^3.5 = 11 times. These non-enriching super-abundant mutants are:

  - `H16_A0774` - cphA, Cyanophycin synthase. Supposed mechanism: enriches in LB medium as it does not turn Asp
    into cyanophycin. No benefit on minimal medium.
  - `H16_B2570` - fecA, Outer membrane receptor for Fe(III). Supposed mechanism: regulator activity together with fecR, but also 
    transmembrane transporter for siderophores. KO reduces sensitivty to excess iron??
  - `H16_A3145` - Conserved protein/domain typically associated with flavoprotein oxygenases, DIM6/NTAB family. 
    Supposed mechanism: unknown.

```{r}
df_fitness %>% filter(Time == 16, Counts > 5*10^5) %>%
  group_by(locus_tag) %>% summarize(max_counts = max(Counts), max_fitness = max(Norm_fg)) %>%
  arrange(desc(max_counts))
```


### Depletion of mutants over time (reduced fitness)

#### Depletion on formate

Several genes/mutants are depleted over time depending on growth condition. Most interesting in this context are cluster 1 and 2, depletion only on formate, or in all conditions, respectively. Not surprisingly, formate-specific genes are highly depleted during growth on formate, but not cbb (Calvin cycle) genes.


```{r, fig.width = 10, fig.height = 6.5}
df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 1))) %>%
  filter(Time == 16) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 1))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))
```

What is the role of formate-essential genes? Are there patterns that emerge? We can submit the depleted gene list to STRING DB and retrieve a possible network of interactions. The network shows immediately that there are many high-scoring interactions between the genes, particularly the soluble FDH genes are sticking out as one cluster (fdsABDG) and their transcriptional regulator fdsR. Then the molybdenum cofactor processing proteins moaCDE, moeA, mobA, mog. And a range of cytochrome genes responsible for acceptance and transport of of electrons from formate to the ETC in cytoplasmic membrane (petABC, ccoGNOP, cyc). It is also intersting what is NOT essential on formate, such as no cbb genes except the master transcriptional regulator cbbR. The reason for this must be the redundancy of all cbb genes (two operons).


```{r, fig.width = 8, fig.height = 4.8}
graph_formate_depl <- retrieve_STRING(
  gene_ID = names(subset(list_enriched, list_enriched == 1)),
  taxon_ID = 381666,
  ref = rename(df_ref, name = locus_tag)
)

graph_formate_depl %>% arrange(COG_Process) %>% activate(edges) %>% 
  filter(score >= 0.4) %>%
  ggraph(layout = 'linear', circular = TRUE) +
  geom_edge_arc(colour = grey(0.6, 0.5), aes(width = score)) + 
  geom_node_point(aes(colour = COG_Process), size = 5) +
  geom_node_text(nudge_y = 0.1, size = 3.2, aes(label = eggNOG_name, colour = COG_Process)) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_graph(background = "white", foreground = grey(0.5),
    plot_margin = margin(10, 10, 10, 10))
```


#### Depletion in all conditions

Which genes are depleted in all conditions (cluster 2)? Here we see the effect of the read compression, such that many genes seem to "recover" from initial depletion and enrich from 8 to 16 generation time point. This effect is only apparent for the continuous conditions and with very high probability related to the distortion of read counts by super-enriching mutants. The 8 generation time point should be considered as more reliable in those cases.

```{r, fig.width = 10, fig.height = 6.5}
df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 2))) %>%
  filter(Time == 16) %>%
  barchart_fitness() %>%
  print(position = c(0,0,0.6,1), more = TRUE)

df_fitness_summary %>%
  filter(locus_tag %in% names(subset(list_enriched, list_enriched == 2))) %>%
  linechart_fitness() %>%
  print(position = c(0.58,0,1,1))
```

```{r, fig.width = 8, fig.height = 5.2}
graph_all_depl <- retrieve_STRING(
  gene_ID = names(subset(list_enriched, list_enriched == 2)),
  taxon_ID = 381666,
  ref = rename(df_ref, name = locus_tag)
)

graph_all_depl %>% arrange(COG_Process) %>% activate(edges) %>% 
  filter(score >= 0.4) %>%
  ggraph(layout = 'linear', circular = TRUE) +
  geom_edge_arc(colour = grey(0.6, 0.5), aes(width = score)) + 
  geom_node_point(aes(colour = COG_Process), size = 5) +
  geom_node_text(nudge_y = 0.1, size = 3.2, aes(label = eggNOG_name, colour = COG_Process)) +
  scale_edge_width(range = c(0.2, 2)) +
  theme_graph(background = "white", foreground = grey(0.5),
    plot_margin = margin(10, 10, 10, 10))
```

```{r, include = FALSE}
# export genes that are essential for growth on fructose (but not LB)
# df_fitness_comp %>%
#   filter(continuous <= -2 & `short pulse` <= -2 & `long pulse` <= -2) %>%
#   write_csv("../data/output/essentiality_fructose.csv")
```
